import urllib.request
import re
import random
from urllib.request import Request
import chardet

def get_pageData(url, encoding='gbk'):
    """Fetch *url* and return the response body decoded as text.

    A User-Agent header is picked at random from a small list of browser
    strings and sent with the request.

    Args:
        url: Address of the page to download.
        encoding: Codec used to decode the raw bytes. Defaults to 'gbk',
            which is what this script has always used.

    Returns:
        The decoded page content as a str.

    Raises:
        urllib.error.URLError: If the request cannot be completed.
        UnicodeDecodeError: If the body is not valid in *encoding*.
    """
    user_agent = [
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
    ]

    headers = {'User-Agent': random.choice(user_agent)}
    requ = Request(url, headers=headers)
    # Use the response as a context manager so the underlying connection is
    # closed even when read() raises.
    with urllib.request.urlopen(requ) as response:
        raw = response.read()
    return raw.decode(encoding)

# Page to scrape. NOTE(review): presumably a "top movies" listing, judging by
# the path — confirm against the live site.
url = 'http://dianying.2345.com/top/'
# Downloaded once at module level; the decoded HTML in `page` is reused by
# both regex passes further down (text info and picture URLs).
page = get_pageData(url)

def parse_page(page,regexExpr):
    """Return every match of *regexExpr* found in *page*.

    The expression is compiled with re.S, so '.' also matches newlines.
    The result is whatever findall yields: a list of strings when the
    expression has at most one group, a list of tuples when it has several.
    """
    return re.compile(regexExpr, flags=re.S).findall(page)

# Regex for the text part of each entry. The three (.*?) groups are what gets
# captured and written out: the link text inside the "sTit" span, the first
# link text inside the "pActor" paragraph, and the text of the "pTxt" paragraph.
regexExpr = '<div class="txt">.*?<span class="sTit">.*?<a.*?>(.*?)</a>.*?<p class="pActor">.*?<a.*?>(.*?)</a>.*?</p>.*?<p class="pTxt.*?>(.*?)<.*?</div>'
txts = parse_page(page,regexExpr)

# Open the output file once for the whole batch and let the context manager
# close (and flush) it; skipping the open when nothing matched keeps the
# behavior of not creating the file for an empty result.
if txts:
    with open('MovieList/电影信息.txt','a',encoding='utf8') as f:
        for txt in txts:
            print(txt)

            # Each match is a 3-tuple, one item per capture group.
            moveName, moveActor, moveDesc = txt

            # Movie record: title, leading actor, synopsis, then blank lines
            # as a separator between records.
            f.writelines([
                '电影:'+moveName+'\n',
                '主演:'+moveActor+'\n',
                '简介:'+moveDesc+'\n\n\n',
            ])


# Regex for the picture of each entry: captures the src attribute of the
# first <img> that follows a "pic" div.
regexExpr = '<div class="pic".*?<img.*?src="(.*?)"'
pics = parse_page(page,regexExpr)

# Download the images.
# zip() pairs each picture with the text entry at the same position and stops
# at the shorter list, so a count mismatch no longer raises IndexError.
for src, info in zip(pics, txts):
    # Only protocol-relative addresses ("//host/path") need a scheme added;
    # prefixing an already absolute URL would produce an invalid one.
    imageUrl = 'http:'+src if src.startswith('//') else src
    # Fetch first and close the response promptly; the file is created only
    # after the download succeeded, so a failed request leaves no empty .png.
    with urllib.request.urlopen(imageUrl) as response:
        data = response.read()
    # The file is named after the captured movie title.
    with open('Images/'+info[0]+'.png','wb') as f:
        f.write(data)


